from vllm import LLM, SamplingParams
from PIL import Image

# Model download: https://huggingface.co/llava-hf/llava-1.5-7b-hf
# limit_mm_per_prompt={"image": 1} caps the number of images per request;
# here each request may contain at most one image.
llm = LLM(model="llava-hf/llava-1.5-7b-hf", limit_mm_per_prompt={"image": 1})
sampling_params = SamplingParams(max_tokens=256)

# LLaVA-1.5 chat format: the <image> placeholder marks where the image
# embeddings are spliced into the prompt.
prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit here?\nASSISTANT:"

# Open the file in a context manager so the OS file handle is released
# (PIL's Image.open is lazy and otherwise keeps the file open).
# .convert("RGB") forces an eager full decode — the pixel data stays valid
# after the `with` block closes the file — and normalizes the color mode.
with Image.open("view.jpg") as img:
    image = img.convert("RGB")

# A single request: prompt text plus the image under multi_modal_data.
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": image},
    },
    sampling_params=sampling_params,
)

# Print the top completion of each returned request (here, just one).
for o in outputs:
    print(o.outputs[0].text)